# Preparation ----
# Driver session: sets up the environment and runs the three pipeline scripts
# (spatial aggregation, main data creation, main data extension) in order.
# Close the active graphics device only when one is open; a bare dev.off()
# stops with "cannot shut down device 1 (the null device)" otherwise.
if (!is.null(dev.list())) dev.off()
cat("\014")  # form feed; clears the console in RStudio
# globals
options(scipen = 999)  # strongly prefer fixed over scientific notation
# NOTE(review): all relative paths used by the sourced scripts resolve against
# this directory; the call fails when the session is not started in the
# project root.
setwd("./code/do")
# packages
# requireNamespace() is the documented way to test for an installed package;
# library() then fails loudly if the installation did not succeed.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
library(pacman)
pacman::p_load(
  tidyverse, broom, hrbrthemes, plm, estimatr, sandwich, lmtest, AER, lfe,
  huxtable, margins, readstata13, texreg, xtable, reshape2,
  readxl, readtext, quanteda, pdftools, stringi, lubridate, sp, maptools,
  rgeos, rgdal, spdep, sf
)
sf::sf_use_s2(FALSE)  # switch off s2 spherical geometry in sf
# NOTE(review): the repeated device cycling between the pipeline steps is kept
# as recorded; its purpose is not evident from this file.
dev.set(dev.next())
dev.set(dev.next())
dev.set(dev.next())
source("01_spatial_aggregation_meetup.R")
dev.set(dev.next())
dev.set(dev.next())
dev.set(dev.next())
source("02_create_maindata.R")
dev.set(dev.next())
dev.set(dev.next())
dev.set(dev.next())
source("03_extend_maindata.R")
dev.set(dev.next())
dev.set(dev.next())
dev.set(dev.next())
# Scratch arithmetic typed at the console (quantities multiplied by 22 and 25).
# The results are only printed, never assigned, and nothing below reads them.
# NOTE(review): looks like leftover ad-hoc calculations unrelated to the
# analysis -- confirm before removing.
300*22
1.2*22
35*22
30*22
1.75*22
600*22
.25*25
120*25
35*25
15*25
400*25
50*25
25*25
# Preparation ----
# Set-up for the opportunity-index analysis: clean workspace, packages, seed.
# NOTE(review): this wipes every object in the global environment, including
# anything created by earlier steps of the same session.
rm(list = ls())
# Close the active graphics device only when one is open; a bare dev.off()
# stops with an error when only the null device exists.
if (!is.null(dev.list())) dev.off()
cat("\014")  # form feed; clears the console in RStudio
# globals
options(scipen = 999)  # strongly prefer fixed over scientific notation
# packages
# requireNamespace() tests for an installed package without attaching it;
# library() then fails loudly if the installation did not succeed.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
library(pacman)
pacman::p_load(
  tidyverse, broom, devtools, remotes, hrbrthemes, plm, estimatr, sandwich,
  lmtest, AER, lfe, huxtable, margins, readstata13, texreg, reshape2, readxl,
  xtable, haven, ggrepel, ggxmean, magrittr, ggbiplot, viridis
)
# tabulizer for zukunftsatlas
# Install from GitHub only when the package is missing, so that re-running the
# script does not need network access every time.
if (!requireNamespace("tabulizer", quietly = TRUE)) {
  remotes::install_github(
    c("ropensci/tabulizerjars", "ropensci/tabulizer"),
    INSTALL_opts = "--no-multiarch",
    dependencies = c("Depends", "Imports")
  )
}
library(tabulizer)
library(dplyr)
set.seed(1115)  # fixed seed for reproducible random draws
# ggplot theme
# theme_set(theme_bw() + theme(text = element_text(size=14)))
# functions
# Min-max rescaling: maps the smallest value of `x` to 0 and the largest to 1.
# `...` is forwarded unchanged to both min() and max() (e.g. na.rm = TRUE).
# A constant `x` yields NaN because the range in the denominator is zero.
normalize01 <- function(x, ...) {
  lower <- min(x, ...)
  upper <- max(x, ...)
  (x - lower) / (upper - lower)
}
# often used code (snippets kept for copy-paste)
# keep first row by value: filter(row_number(value) == 1)
# column position by name: which(colnames(df) == "varname")
# stratified sample: df %>% group_by(stratify_by_me) %>% sample_n(size = n)
# reorder columns: dplyr::relocate()
# reorder factor levels: df.mtcars %>% mutate(brand = as.factor(brand), brand = reorder(brand, mpg))
# define paths ----
# Input and output directories used throughout this session.
cwpath    <- "~/Dropbox/crosswalks/"
datpath   <- "~/Dropbox/projects/opportunitymap/analysis/data/"
shapepath <- "~/Dropbox/projects/opportunitymap/map/shapefiles/vg2500_geo84/"
outpath   <- "~/Dropbox/projects/opportunitymap/overleaf/out/"

# load data ----
# The .RData file is expected to provide `dfimp`, which is used right away;
# urbanity3 splits the units into population-density terciles.
load(paste0(datpath, "dfimp.RData"))
df <- mutate(dfimp, urbanity3 = ntile(popdens, 3))

# fix missing Goettingen value
# Where the per-1000-inhabitants rate is missing, rebuild it from the raw
# count and the population.
df$nr_artists_sc <- with(
  df,
  ifelse(is.na(nr_artists_sc), nr_artists / (population.bbsr / 1000), nr_artists_sc)
)
df$nr_artists_sc_log <- log(df$nr_artists_sc + 1)
df$nr_jobs_sc <- with(
  df,
  ifelse(is.na(nr_jobs_sc), number_of_jobs_total / (population.bbsr / 1000), nr_jobs_sc)
)

# fix child care quality issue
# Values above 100 are divided by 1000.
# NOTE(review): presumably a decimal-separator artefact in the raw data -- confirm.
df$ccare_quality <- with(
  df,
  ifelse(ccare_quality > 100, ccare_quality / 1000, ccare_quality)
)

# add long term unemp
# Custom column class so read.csv() can parse numbers written with a decimal
# comma: commas are swapped for dots before the numeric conversion.
setClass("num.with.commas")
setAs(
  "character", "num.with.commas",
  function(from) as.numeric(gsub(",", ".", from))
)
lt_unemp <- read.csv(
  paste0(datpath, "ltunemp_krs.csv"),
  sep = ";",
  colClasses = c("character", "numeric", "num.with.commas")
)
# Keep the first 401 rows and columns 2-3 (key and share), then name them.
lt_unemp <- setNames(lt_unemp[1:401, 2:3], c("GKZ", "lt_unemp_share"))
df <- merge(df, lt_unemp, by = "GKZ", all.x = TRUE)
# merge additional data ----
# County key: GKZ without its last three characters.
df$krs <- substr(df$GKZ, 1, nchar(df$GKZ) - 3)

# Read one "<amenity>_by_kreis.csv" file and return a data frame with the
# columns `krs` and `count_name` (the file's `n` column, renamed).
#   file_name  : file name inside `data_dir`
#   count_name : name for the count column in the result
#   data_dir   : directory prefix, defaults to the session-wide `datpath`
read_kreis_counts <- function(file_name, count_name, data_dir = datpath) {
  counts <- read.csv(paste0(data_dir, file_name))
  # county code 3152 is recoded to 3159, as in every county-level file here
  counts$krs[counts$krs == 3152] <- 3159
  counts <- counts[, c("krs", "n")]
  names(counts) <- c("krs", count_name)
  counts
}

# Left-join a count table onto `data` by `krs`; counties without a record in
# the count table get 0 instead of NA.
add_kreis_counts <- function(data, counts, count_name) {
  data <- merge(data, counts, by = "krs", all.x = TRUE)
  data[[count_name]] <- ifelse(is.na(data[[count_name]]), 0, data[[count_name]])
  data
}

# theater
theater <- read_kreis_counts("theater_by_kreis.csv", "theater_n")
df <- add_kreis_counts(df, theater, "theater_n")
# nightclub
nightclub <- read_kreis_counts("nightclub_by_kreis.csv", "nightclub_n")
df <- add_kreis_counts(df, nightclub, "nightclub_n")
# playground
playground <- read_kreis_counts("playground_by_kreis.csv", "playground_n")
df <- add_kreis_counts(df, playground, "playground_n")
# harmonized scaling of variables ----
# Every amenity count enters as log(count + 1) relative to log(population),
# so the four measures share one scale.
df$artists_pc     <- with(df, log(nr_artists + 1) / log(population.bbsr))
df$theaters_pc    <- with(df, log(theater_n + 1) / log(population.bbsr))
df$nightclubs_pc  <- with(df, log(nightclub_n + 1) / log(population.bbsr))
df$playgrounds_pc <- with(df, log(playground_n + 1) / log(population.bbsr))
# Pairwise correlations between the four scaled measures (printed only).
cor_vars <- select(df, artists_pc, theaters_pc, nightclubs_pc, playgrounds_pc)
cor(cor_vars)
# commuting and public transport ----
maxpendel <- read.csv(paste0(datpath, "maxpendel_by_krs.csv"))
maxpendel$krs[maxpendel$krs == 3152] <- 3159  # same county recode as the other files
# clean one case with two pendel destinations
# County 5374 is listed with two destinations; the row pointing to 5378 is removed.
rm_row <- which(maxpendel$krs == 5374 & maxpendel$krs_max_pendel == 5378)
# Guard the negative index: with no match, x[-integer(0), ] selects zero rows
# and would silently empty the whole table.
if (length(rm_row) > 0) {
  maxpendel <- maxpendel[-rm_row, ]
}
# The first column of the file is dropped before merging.
# NOTE(review): presumably a row-index column written by write.csv -- confirm.
df <- merge(df, maxpendel[, -1], by = "krs", all.x = TRUE)
# Commuters to the main destination relative to the county's population.
df$maxpendel_sc <- df$n_max_pendel / (df$population.bbsr)
pub_trans <- read.csv(paste0(datpath, "oev_by_krs.csv"))
pub_trans$krs[pub_trans$krs == 3152] <- 3159
df <- merge(df, pub_trans[, -1], by = "krs", all.x = TRUE)
# Mean of the two logged p_ozmz_* measures (suffixes miv and oev).
df$remoteness <- (log(df$p_ozmz_miv) + log(df$p_ozmz_oev)) / 2
# basic services index
# Unweighted mean of the four bsup_* indicators (doc, pharmacy, pschool, smarket).
df$service_provision <- (df$bsup_doc + df$bsup_pharmacy + df$bsup_pschool + df$bsup_smarket) / 4
# Principal Component Analysis ----
# full set of vars
# Columns 1-2 (Kommune, urbanity3) are identifiers kept for labelling and
# grouping; every later column is a PCA input. Rows with any missing value
# are dropped.
opm_vars <- na.omit(
  select(
    df,
    Kommune, urbanity3,
    wpcentr, # influx,
    pendel, broadband, # mobility and connectivity
    nr_jobs_sc, share_high_skill, # jobs
    pp, sgb2, lt_unemp_share, # wealth
    offences, pmdeath, # security and health
    # recreation_area, ccare_quality,
    # playgrounds_pc, # nature and childcare
    artists_pc, theaters_pc, nightclubs_pc, # culture
    oenv, # public transport
    # remoteness, # commuting time to next center
    service_provision # provision of basic services
  )
)
# Centered and unit-variance-scaled PCA on everything but the two id columns.
opm.pca <- prcomp(opm_vars[, -(1:2)], center = TRUE, scale. = TRUE)
summary(opm.pca)
plot(opm.pca)  # scree plot of component variances
# PCA - Leave one out ----
# For each input variable, refit the PCA without it and record the cumulative
# variance share of the first three components, to compare with the full model.
opm.pca <- prcomp(opm_vars[, -(1:2)], center = TRUE, scale. = TRUE)
summary(opm.pca)

# Cumulative share of variance explained by the first `n_comp` components of
# a prcomp fit.
cum_var_share <- function(fit, n_comp = 3) {
  cumsum(fit$sdev^2 / sum(fit$sdev^2))[n_comp]
}

benchmark <- cum_var_share(opm.pca)

# Build the result in one step: no rbind() growth inside a loop, no 1:n index
# that misbehaves for n = 0, and the shares stay numeric throughout instead of
# being round-tripped through character.
pca_input <- opm_vars[, -(1:2)]
cum_all <- data.frame(
  var = colnames(pca_input),
  cum = vapply(
    seq_along(pca_input),
    function(i) {
      cum_var_share(prcomp(pca_input[-i], center = TRUE, scale. = TRUE))
    },
    numeric(1)
  )
)

# One point per left-out variable; the vertical line marks the full model.
cum_all %>%
  mutate(var_ordered = fct_reorder(var, cum)) %>%
  ggplot(aes(x = cum, y = var_ordered)) +
  geom_point() + geom_vline(aes(xintercept = benchmark)) +
  ylab("") + xlab("Explained cumulative variance if [var] is left out vs. benchmark including all [vars]")
# Biplots of the PCA. `choices` selects the pair of components shown,
# `labels` prints the Kommune name at each observation, and `groups` colours
# observations by population-density tercile (urbanity3).
# The first six calls are exploratory views only; nothing is saved from them.
ggbiplot(opm.pca, choices=1:2)
ggbiplot(opm.pca, labels=opm_vars$Kommune)
ggbiplot(opm.pca, choices=c(1,3))
ggbiplot(opm.pca, choices=2:3)
ggbiplot(opm.pca, choices=2:3, labels=opm_vars$Kommune)
ggbiplot(opm.pca, choices=c(1,4))
# work // security+prosperity // leisure+culture
# Every ggsave() below is called without a `plot` argument and therefore
# writes the most recent plot; the order of the statements matters.
ggbiplot(opm.pca)
ggsave(paste0(outpath, "pca_12.pdf"))
ggbiplot(opm.pca, choices=c(1,3), labels=opm_vars$Kommune)
ggbiplot(opm.pca, choices=c(1,3))
ggsave(paste0(outpath, "pca_13.pdf"))
ggbiplot(opm.pca, choices=2:3)
# NOTE(review): this file name lacks the underscore used by the other outputs
# ("pca_12", "pca_13") -- confirm whether that is intended.
ggsave(paste0(outpath, "pca23.pdf"))
ggbiplot(opm.pca, labels=opm_vars$Kommune)
ggsave(paste0(outpath, "pca_12_label.pdf"))
# Grouped versions; only the last one (with ellipses) is saved.
ggbiplot(opm.pca, groups=factor(opm_vars$urbanity3)) +
scale_color_manual("Population Density",values=c("orange","red", "green"), labels=c("low", "mid", "high"))
ggbiplot(opm.pca, choices=c(2,3), groups=factor(opm_vars$urbanity3)) +
scale_color_manual("Population Density",values=c("orange","red", "green"), labels=c("low", "mid", "high"))
ggbiplot(opm.pca, ellipse=TRUE, groups=factor(opm_vars$urbanity3)) +
scale_color_manual("Population Density",values=c("cadetblue1","cornflowerblue", "darkblue"), labels=c("low", "mid", "high"))
ggsave(paste0(outpath, "pca_12_by_urbanity.pdf"))
# create index ----
# pca version ----
# opm_vars was built with na.omit(), so the PCA scores may have fewer rows
# than df. Place each score in the df row it was computed from; rows that were
# dropped for missing values get NA instead of breaking the assignment.
pca_dropped <- attr(opm_vars, "na.action")  # NULL when no row was dropped
pca_rows <- setdiff(seq_len(nrow(df)), pca_dropped)
stopifnot(length(pca_rows) == nrow(opm.pca$x))
pc_scaled <- matrix(NA_real_, nrow = nrow(df), ncol = 3)
pc_scaled[pca_rows, ] <- scale(opm.pca$x[, 1:3])  # standardises each column
# Plain numeric vectors are stored (scale() alone would leave 1-column
# matrices inside df).
df$pc1 <- -pc_scaled[, 1]  # recode so that higher values are positive
df$pc2 <- pc_scaled[, 2]
df$pc3 <- pc_scaled[, 3]
# na.rm is forwarded to min()/max() so rows without scores do not turn the
# whole rescaled column into NA.
df$pc1_01 <- normalize01(df$pc1, na.rm = TRUE)
df$pc2_01 <- normalize01(df$pc2, na.rm = TRUE)
df$pc3_01 <- normalize01(df$pc3, na.rm = TRUE)
# Visualization of the three components
# persp() needs increasing x/y grid vectors and a z matrix, so it cannot draw
# this scatter of observations; pairwise scatterplots show the same three
# dimensions with base graphics.
pairs(df[, c("pc1_01", "pc2_01", "pc3_01")])
# Share of total variance per component, stored as constant columns.
prop_var <- opm.pca$sdev^2 / sum(opm.pca$sdev^2)
df$prop_var1 <- prop_var[1]
df$prop_var2 <- prop_var[2]
df$prop_var3 <- prop_var[3]
# Weights: each component's share relative to the first three combined.
df$pc_w1 <- df$prop_var1 / (df$prop_var1 + df$prop_var2 + df$prop_var3)
df$pc_w2 <- df$prop_var2 / (df$prop_var1 + df$prop_var2 + df$prop_var3)
df$pc_w3 <- df$prop_var3 / (df$prop_var1 + df$prop_var2 + df$prop_var3)
# create index ----
# weighted sum of first three components
df$opportunity_pc <- (df$pc1 * df$pc_w1) + (df$pc2 * df$pc_w2) + (df$pc3 * df$pc_w3)
# scaled to 0 - 1
df$opportunity_pc01 <- normalize01(df$opportunity_pc, na.rm = TRUE)
# add nicest neighbor ----
# For every county, look up the index values of the county stored in
# krs_max_pendel and attach them as nn_* columns; maxpendel_sc becomes the
# neighbor weight.
nn <- df %>% dplyr::select(krs, krs_max_pendel, maxpendel_sc)
nn <- merge(
  nn,
  df[, c("krs", "opportunity_pc01", "pc1_01", "pc2_01", "pc3_01")],
  by.x = "krs_max_pendel", by.y = "krs"
)
nn <- nn %>%
  arrange(krs) %>%
  dplyr::select(-krs_max_pendel) %>%
  dplyr::rename(
    nn_weight = maxpendel_sc,
    nn_opportunity_pc01 = opportunity_pc01,
    nn_pc1_01 = pc1_01,
    nn_pc2_01 = pc2_01,
    nn_pc3_01 = pc3_01
  )
df <- merge(df, nn, by = "krs", all.x = TRUE)
# Interactive inspection of the merged neighbor values.
nn
df %>% dplyr::select(Kommune, starts_with("nn"), opportunity_pc01)
df %>% dplyr::select(Kommune, starts_with("nn"), opportunity_pc01) %>% arrange(nn_weight)
df %>% dplyr::select(Kommune, starts_with("nn"), opportunity_pc01) %>% arrange(desc(nn_weight))
# weighted by nicest neighbor
# The neighbor-weighted index has to exist before it is selected; the earlier
# inspection call that referenced it ahead of this line failed and is dropped
# (the identical call follows directly below).
df$opportunity_pc01_wn <- df$opportunity_pc01 + (df$nn_opportunity_pc01 * df$nn_weight)
df %>%
  dplyr::select(Kommune, starts_with("nn"), opportunity_pc01, opportunity_pc01_wn) %>%
  arrange(desc(nn_weight))
# Hand calculation with the same form as the formula above. The copy of this
# line with a stray closing parenthesis was a parse error that prevented the
# file from being sourced at all, and is removed.
0.57 + 0.16*0.99
plot(df$opportunity_pc01_wn, df$turnout)
ggplot(df, aes(x = opportunity_pc01_wn, y = turnout)) +
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x)
# Top and bottom ten places side by side. with_ties = FALSE guarantees the
# same number of rows on both sides, which cbind() requires.
top_opc_wn <- df %>%
  dplyr::select(Kommune, opportunity_pc01_wn) %>%
  slice_max(opportunity_pc01_wn, n = 10, with_ties = FALSE)
bottom_opc_wn <- df %>%
  dplyr::select(Kommune, opportunity_pc01_wn) %>%
  slice_min(opportunity_pc01_wn, n = 10, with_ties = FALSE)
topbottom_opc_wn <- cbind(top_opc_wn, bottom_opc_wn)
names(topbottom_opc_wn) <- c("Place", "Top-10", "Place", "Bottom-10")
xtable(topbottom_opc_wn)
# REPLICATION
# Place-Based Campaigning: The Political Impact of Real Grassroots Mobilization
# Daniel Bischof and Thomas Kurer
# Journal of Politics
# Figures produced in R
# Descriptives of Events over time presented in Figure 1
# Text Analysis presented in Figure 3, Figure A.1 and Table A.2
# ggplot theme: black-and-white base theme with 14pt text as the default for
# all plots created afterwards
theme_set(
  theme_bw() +
    theme(text = element_text(size = 14))
)
# define colors to match Stata scheme
# Hex codes; the trailing comment gives the red/green/blue values (0-255).
DB_blue      <- "#2056A5"  #  32,  86, 165
DB_bluelight <- "#4FBEFF"  #  79, 190, 255
DB_gold      <- "#B36F14"  # 179, 111,  20
DB_greenish  <- "#B3B02E"  # 179, 176,  46
DB_grey      <- "#6D8999"  # 109, 137, 153
DB_orange    <- "#DFA95A"  # 223, 169,  90
DB_red       <- "#FF696E"  # 255, 105, 110
DB_red2      <- "#CC2B31"  # 204,  43,  49
DB_redlight  <- "#FF9E8F"  # 255, 158, 143
DB_turqoise  <- "#4FFFE1"  #  79, 255, 225
# Descriptives (Figure 1) ----
# Event-level data; the free-text event description becomes the `text` field.
m5s_data <- readtext("./../../data_original/M5S/0_events_20181206.csv", text_field = "event_description")
m5s_data$event_year <- as.numeric(substr(m5s_data$time, 1, 4))
# some missing: fall back to the year in which the event entry was created
m5s_data$event_year_sub <- as.numeric(substr(m5s_data$event_created, 1, 4))
m5s_data$event_year <- ifelse(is.na(m5s_data$event_year), m5s_data$event_year_sub, m5s_data$event_year)
m5s_data$event_month <- as.numeric(substr(m5s_data$time, 6, 7))
m5s_data$event_day <- as.numeric(substr(m5s_data$time, 9, 10))
# Explicit format: without it as.Date() guesses the format from the first
# element and stops with an error if that one is incomplete (e.g.
# "2015-NA-NA" for events whose year was filled in above). With the format,
# incomplete dates simply become NA.
m5s_data$event_date <- as.Date(
  paste(m5s_data$event_year, m5s_data$event_month, m5s_data$event_day, sep = "-"),
  format = "%Y-%m-%d"
)
# Nr of groups over time
# One row per group (the first event row of each group_id), then a running
# count of groups ordered by creation date.
groups <- m5s_data %>%
  dplyr::select(group_id, group_created, group_name, group_location, group_url) %>%
  group_by(group_id) %>%
  dplyr::filter(row_number(group_id) == 1) %>%
  ungroup() %>%
  dplyr::rename(id = group_id, date = group_created, name = group_name, loc = group_location, url = group_url)
groups$date <- as.Date(substr(groups$date, 1, 10))
groups <- groups %>%
  arrange(date) %>%
  mutate(one = 1, nrgroups = cumsum(one))
# Nr of events over time
# Events per calendar month; events without a usable date are dropped.
monthly <- m5s_data %>%
  dplyr::mutate(one = 1) %>%
  group_by(date = floor_date(event_date, "month")) %>%
  dplyr::summarise(nr = sum(one)) %>%
  dplyr::filter(!is.na(date))
monthly$year <- substr(monthly$date, 1, 4)
monthly$month <- substr(monthly$date, 6, 7)
# Plots ----
# CUMULATIVE NR OF GROUPS
# Marker lines are drawn with annotate("segment"): a geom_segment() whose
# aes() holds only constants draws one identical segment per row of `groups`.
ggplot(groups, aes(x = date, y = nrgroups)) +
  # EVENTS
  # Grillo suggests use of Meetup
  annotate("segment", x = as.Date("2005-07-16"), y = 0, xend = as.Date("2005-04-16"), yend = 150,
           linetype = "dashed", color = "black") +
  annotate("text", x = as.Date("2005-05-16"), y = 180, label = "Grillo suggests \n use of Meetup") +
  # V-Day
  annotate("segment", x = as.Date("2007-09-08"), y = 80, xend = as.Date("2007-05-08"), yend = 300,
           linetype = "dashed", color = "black") +
  annotate("text", x = as.Date("2007-05-08"), y = 320, label = "'V-Day'") +
  # Referendum
  annotate("segment", x = as.Date("2016-12-04"), y = 900, xend = as.Date("2016-09-04"), yend = 975,
           linetype = "dotted", color = "black") +
  annotate("text", x = as.Date("2016-09-04"), y = 985, label = "Referendum") +
  # General Election
  annotate("segment", x = as.Date("2013-02-24"), y = 400, xend = as.Date("2012-11-24"), yend = 555,
           linetype = "dotted", color = "black") +
  annotate("text", x = as.Date("2012-11-24"), y = 575, label = "General Election") +
  # Local Elections 2012
  annotate("segment", x = as.Date("2012-05-06"), y = 180, xend = as.Date("2012-01-06"), yend = 355,
           linetype = "dotted", color = "black") +
  annotate("text", x = as.Date("2012-01-06"), y = 375, label = "Local Elections") +
  # NR OF GROUPS IN FRONT (added last so the line is drawn over the markers)
  geom_line() +
  # SCALES, LABEL, THEME
  scale_x_date(
    date_breaks = "6 month",
    limits = as.Date(c("2004-01-01", "2019-01-01")), expand = c(0, 0),
    date_labels = "%b-%Y"
  ) +
  xlab("") + ylab("") +
  theme_bw() + theme(text = element_text(size = 20), axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("./../../results/figures/fig1a_cum_nr_groups.pdf", height = 5, width = 8.5)
# MONTHLY NUMBER OF EVENTS
# Faint raw monthly counts with a loess smoother on top.
ggplot(monthly, aes(x = date, y = nr)) +
  geom_point(alpha = 0.2) +
  geom_line(alpha = 0.2) +
  geom_smooth(color = "black", method = "loess") +
  # EVENTS
  # Referendum
  # annotate("segment") draws the marker once; a geom_segment() whose aes()
  # holds only constants would draw it once per row of `monthly`.
  annotate("segment", x = as.Date("2016-12-04"), y = 0, xend = as.Date("2016-12-04"), yend = 4200,
           linetype = "dotted", color = "gray") +
  annotate("text", x = as.Date("2016-12-04"), y = 4300, label = "Referendum") +
  # SCALES ETC
  scale_x_date(
    date_breaks = "6 month",
    limits = as.Date(c("2004-01-01", "2019-01-01")), expand = c(0, 0),
    date_labels = "%b-%Y"
  ) +
  xlab("") + ylab("") +
  theme_bw() + theme(text = element_text(size = 20), axis.text.x = element_text(angle = 45, hjust = 1))
ggsave("./../../results/figures/fig1b_nr_events_month.pdf", height = 5, width = 8.5)
# Text Analysis (Figure 3, Figure A.1, Table A.2) ----
# load data
textanalysis <- readtext("./../../data_original/M5S/0_events_20181206.csv", text_field = "event_description")
# basic cleaning
textanalysis$text <- tolower(textanalysis$text)
textanalysis$text <- stri_replace_all(textanalysis$text, " ", regex = "<.*?>")   # remove html tags
textanalysis$text <- gsub("l'", " ", textanalysis$text) # remove shortened article
textanalysis$text <- stri_trans_general(textanalysis$text, "Latin-ASCII") # strip accents: già -> gia
textanalysis$event_year <- as.numeric(substr(textanalysis$time, 1, 4))
# some missing: fall back to the year in which the event entry was created
textanalysis$event_year_sub <- as.numeric(substr(textanalysis$event_created, 1, 4))
textanalysis$event_year <- ifelse(is.na(textanalysis$event_year), textanalysis$event_year_sub, textanalysis$event_year)
textanalysis$event_month <- as.numeric(substr(textanalysis$time, 6, 7))
textanalysis$event_day <- as.numeric(substr(textanalysis$time, 9, 10))
# Explicit format: without it as.Date() guesses the format from the first
# element and stops with an error if that one is incomplete (e.g.
# "2015-NA-NA"); with it, incomplete dates become NA.
textanalysis$event_date <- as.Date(
  paste(textanalysis$event_year, textanalysis$event_month, textanalysis$event_day, sep = "-"),
  format = "%Y-%m-%d"
)
textanalysis <- textanalysis %>% dplyr::filter(!is.na(event_year))
corp <- corpus(textanalysis)  # build a new corpus from the texts
summary(corp)
# First document's text; as.character() replaces the deprecated texts().
as.character(corp)[1]
# tokenization: remove stopwords etc.
# Stopword list: the Italian defaults with accents stripped (già -> gia, to
# match the cleaned texts) plus corpus-specific filler, month and weekday
# names, and markup/URL residue.
stopwords_it <- stri_trans_general(stopwords("it"), "Latin-ASCII")
stopwords_broad <- c(
  stopwords_it,
  "quindi", "c'è", "c'e", "de", "ogni", "poi", "ancora", "può", "puo", "qui",
  "solo", "link", "gt", "pdf", "and", "amp", "for", "more", "information",
  "see", "n",
  "gennaio", "luglio", "febbraio", "agosto", "marzo", "settembre",
  "aprile", "ottobre", "maggio", "novembre", "giugno", "dicembre",
  "varie", "eventuali", "odg", "o.d.g.", "o.d.g", "ore", "via", "masked",
  "giorno", "meetup", "http", "the", "00", "of", "san", "essere", "fare",
  "cosa", "sempre", "due", "primo",
  "domenica", "sabato", "venerdi", "giovedi", "mercoledi", "martedi", "lunedi",
  "dopo", "altri", "altre", "seguente", "serata", "dare", "lt", "It", "gia",
  "senza", "prossimo", "prossime", "data", "ora", "settimana", "quali",
  "inoltre", "nuova", "nuovo", "marco", "fatto", "tutta", "tutto", "tutti",
  "zero", "ecc"
)
# Order matters: numbers are kept until the multi-word party names have been
# compounded ("movimento 5 stelle"), and only then removed.
toks <- corp %>%
  tokens(remove_punct = TRUE, remove_url = TRUE, remove_symbols = TRUE) %>%
  tokens_compound(
    pattern = phrase(c(
      "beppe grillo",
      "movimento 5 stelle", "5 stelle", "movimento cinque stelle", "cinque stelle",
      "raccolta firme", "raccolta di firme"
    ))
  ) %>%
  tokens(remove_numbers = TRUE)
toks <- tokens_remove(toks, pattern = stopwords_broad)
# Document-feature matrix per event and aggregated per event year.
# NOTE(review): passing the docvar name as a string to `groups` follows the
# older quanteda interface -- confirm against the installed version.
dfm <- dfm(toks)
dfmy <- dfm_group(dfm, groups = "event_year")
# Quick inspection of size and most frequent terms.
topfeatures(dfm, 50)
ndoc(dfm)
nfeat(dfm)
topfeatures(dfmy, 300)
# Within-document proportions (also grouped by year) and tf-idf weights.
dfm_prop <- dfm_weight(dfm, scheme = "prop")
dfmy_prop <- dfm_group(dfm_prop, groups = "event_year")
dfm_uniqueness <- dfm_tfidf(dfm)
# Top 30 features per year.
freq <- textstat_frequency(dfmy, n = 30, groups = "event_year")
freq <- as.data.frame(freq)
# Marker colours per feature; later assignments override earlier ones.
# Compounded tokens are joined with "_", so the two-word party name has to be
# matched as "cinque_stelle" -- "cinque stelle" with a space can never occur
# as a feature.
freq$color <- "grey"
freq$color[freq$feature == "referendum"] <- "red"
freq$color[freq$feature %in% c("m5s", "movimento_5_stelle", "5_stelle", "cinque_stelle")] <- "yellow"
freq$color[freq$feature %in% c("beppe", "grillo", "beppe_grillo")] <- "darkgreen"
freq$color[freq$feature == "elezioni"] <- "blue"
# grouped features ----
# Hand-coded mapping from top features to topic groups; features in the
# "generic" group are set to NA and excluded from the topic totals.
feature_group <- read_excel("./../../data_original/M5S/topfeatures_groups.xls")
freq_g <- merge(freq, feature_group, by = "feature", all.x = TRUE)
head(freq_g)
freq_g$feature_group[freq_g$feature_group == "generic"] <- NA

# Total frequency per year and topic. After summarise() the data is still
# grouped by year, so `rank` orders topics within each year.
freqplot <- freq_g %>%
  dplyr::filter(!is.na(feature_group)) %>%
  dplyr::rename(year = group) %>%
  group_by(year, feature_group) %>%
  dplyr::summarise(frequency_group = sum(frequency)) %>%
  mutate(rank = dense_rank(frequency_group))
freqplot$referendum_marker <- ifelse(freqplot$feature_group == "referendum", "red", "grey")
# Colour label per topic; topics without an entry stay "grey".
freqplot$color <- local({
  topic_color <- c(
    environment = "darkgreen", m5s = "yellow", referendum = "darkblue",
    populism = "red", directdemocracy = "orange"
  )
  topic <- as.character(freqplot$feature_group)
  unname(ifelse(topic %in% names(topic_color), topic_color[topic], "grey"))
})

# combine referendum and dd
# feature_group2 folds "referendum" into "directdemocracy".
freq_g2 <- freq_g
freq_g2$feature_group2 <- freq_g2$feature_group
freq_g2$feature_group2[freq_g2$feature_group == "referendum"] <- "directdemocracy"
freqplot2 <- freq_g2 %>%
  dplyr::filter(!is.na(feature_group)) %>%
  dplyr::rename(year = group) %>%
  group_by(year, feature_group2) %>%
  dplyr::summarise(
    frequency_group = sum(frequency),
    feature_group = first(feature_group)
  ) %>%
  mutate(rank = dense_rank(frequency_group))
# color: only placeholder function, replaced with color manual
freqplot2$color <- local({
  topic_color <- c(
    environment = "darkgreen", m5s = "yellow", referendum = "darkblue",
    populism = "red", directdemocracy = "orange"
  )
  topic <- as.character(freqplot2$feature_group2)
  unname(ifelse(topic %in% names(topic_color), topic_color[topic], "grey"))
})
# Figure 3: topic frequencies per year, one facet per year, bars ordered by
# within-year rank and labelled with the topic name.
# drop first two years for more compact plot
# NOTE(review): `year` comes from the grouping column of the frequency table;
# the numeric comparison below relies on implicit coercion -- confirm the
# column type.
# NOTE(review): the object name `plot` shadows base::plot() for value lookups.
plot <- ggplot(subset(freqplot2, year>2006&year<2019), aes(y=frequency_group, x=rank)) +
geom_bar(aes(fill=color), position = 'dodge', stat="identity") +
geom_text(aes(label=feature_group2), nudge_y=5000, angle=90) +
facet_wrap(~year) +
ylim(0, 20000) +
ylab("Most Frequent Topics") + xlab("") +
# The `color` column only separates the bars into fill groups; the colours
# actually drawn are the DB_* values listed here.
scale_fill_manual(values=c(DB_greenish, DB_grey, DB_red, DB_bluelight, DB_orange, DB_turqoise)) +
theme(axis.text.x = element_blank(), legend.position="none")
print(plot)
# ggsave() has no `plot` argument here and writes the most recent plot.
ggsave("./../../results/figures/fig3_topic_by_year.pdf", height=10, width=17)
# Figure A.1: yearly frequency of the direct-democracy topic versus the
# referendum topic, as stacked bars.
dd <- freqplot %>% dplyr::filter(feature_group=="directdemocracy" | feature_group=="referendum")
ggplot(dd, aes(x=year, y=frequency_group, fill=feature_group)) +
geom_bar(stat="identity") + scale_fill_manual(labels = c("general: direct democracy features", "specific: referendum | costituzione | costituzionale"), values=c(DB_red, DB_blue)) +
ylab("Topic Frequency") + xlab("") +
theme(legend.position = "bottom",
legend.direction="vertical",
legend.title = element_blank())
ggsave("./../../results/figures/figa1_referendum_relevance.pdf", height=5, width=8.5)
# table: grouping of terms ----
# One row per topic listing its terms as a comma-separated string, together
# with the topic's total frequency, written out as a LaTeX table.
grouping_table <- dplyr::select(freq_g2, feature, feature_group = feature_group2)
# Features without a topic are shown under "generic".
grouping_table$feature_group[is.na(grouping_table$feature_group)] <- "generic"
# Each feature appears once per year in freq_g2; keep its first occurrence.
grouping_table <- grouping_table[!duplicated(grouping_table$feature), ]
grouping_table <- aggregate(feature ~ feature_group, grouping_table, toString)
# Topic totals over all years.
total <- freqplot %>%
  group_by(feature_group) %>%
  dplyr::summarise(sum = sum(frequency_group)) %>%
  arrange(sum)
# Inner merge: only topics that have a total are kept; largest totals first.
grouping_table <- merge(grouping_table, total, by = "feature_group") %>%
  dplyr::select(feature_group, feature, sum) %>%
  arrange(desc(sum))
names(grouping_table) <- c("topic", "terms", "total nr of terms")
grouping_table <- xtable(grouping_table, digits = 0)
print(
  grouping_table,
  include.rownames = FALSE,
  file = "./../../results/tables/taba1_text_grouping.tex"
)
# heatmap with counts ----
meetupdata <- read.csv("./../../data_original/M5S/0_events_20181206.csv")
# Only the location, date and RSVP columns are needed from here on.
keep <- c("group_location", "group_lat", "group_lon", "group_country", "local_date", "yes_rsvp_count")
# Derive year (first four characters of the date string), a proper Date and a
# year-month value, then keep Italian groups with events up to 2018.
# NOTE(review): as.yearmon() is provided by the zoo package, which is not
# loaded explicitly anywhere in this part of the file -- confirm it is attached.
location_df <- meetupdata[, keep] %>%
  mutate(
    year = substr(local_date, 1, 4),
    local_date = as.Date(local_date),
    ym = as.yearmon(local_date)
  ) %>%
  filter(group_country == "it" & year <= 2018)
